Load the Required Packages:

Below, the packages required for data analysis and visualization are loaded.

library(tidyverse)
library(magrittr)
library(DBI)
library(dbplyr)
library(RMariaDB)
library(data.table)
library(stopwords)
library(tidytext)
library(RColorBrewer)
library(DT)
library(wordcloud)

State the Research Question:

W. Edwards Deming said, “In God we trust, all others must bring data.” Below, we will use data to explore the question, “Which are the most valued data science skills?”

Connect to the SQL Database:

# Open a connection to the MariaDB jobs database. The password is kept out
# of source control in a local one-cell file, sql_db.txt.
db_password <- as.character(read.table("sql_db.txt", header = FALSE))
con <- DBI::dbConnect(
  RMariaDB::MariaDB(),
  dbname = "dat_sci_jobs",
  username = "root",
  password = db_password,
  host = "35.227.102.234")

Read from the SQL Database and Disconnect:

# Pull the full "_Jobs" table into jobs_df, preview it interactively, then
# close the connection. Column 3 is hidden in the preview (presumably the
# long Job_url column — confirm against the table schema).
tables <- dbListTables(con)
jobs_df <- dbReadTable(con, "_Jobs")
datatable(jobs_df[, -3], options = list(pageLength = 25))
dbDisconnect(con)

Create a Data Frame of Jobs Found via RSS Feed By Combining CSV Files and Removing Duplicates:

# Track which CSV files have already been ingested so re-running the script
# does not duplicate rows.
completed_files <- readLines("completed_files.txt")

url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/"

# Jobs collected via RSS feeds (11 raw columns expected).
new_jobs_df <- as.data.frame(matrix(nrow = 0, ncol = 11))
files <- list.files(pattern = "_feeds_.*csv$")

# seq_along() is safe when no files match; 1:length(files) would yield
# c(1, 0) and crash on files[NA].
for (i in seq_along(files)){
    if (!(files[i] %in% completed_files)){
        file <- paste0(url_base, files[i])
        csv <- read.csv(file = file, header = TRUE)
        new_jobs_df <- rbind(new_jobs_df, csv)
        completed_files <- c(completed_files, files[i])
    }
}

# Jobs collected from LinkedIn exports (10 raw columns expected).
new_jobs_df2 <- as.data.frame(matrix(nrow = 0, ncol = 10))
files <- list.files(pattern = "_linkedin_.*csv$")

for (i in seq_along(files)){
    if (!(files[i] %in% completed_files)){
        file <- paste0(url_base, files[i])
        csv <- read.csv(file = file, header = TRUE)
        new_jobs_df2 <- rbind(new_jobs_df2, csv)
        completed_files <- c(completed_files, files[i])
    }
}

# Persist the updated ledger of processed files.
writeLines(completed_files, "completed_files.txt")

# Drop exact duplicate rows within each new batch.
new_jobs_df <- new_jobs_df[!duplicated(new_jobs_df), ]
new_jobs_df2 <- new_jobs_df2[!duplicated(new_jobs_df2), ]

Remove Unnecessary Columns in Jobs Data Frame and Rearrange Remaining Columns to Prepare the Data for the SQL Data Base:

if (nrow(new_jobs_df) > 0){
    # Drop feed metadata columns we never use downstream.
    new_jobs_df <- subset(new_jobs_df, select = -c(X, author, summary,
        content, extracted_content_url, published, created_at))
    colnames(new_jobs_df) <- c("Job_id", "Site_id", "Job_title", "Job_url")
    # RSS feeds carry no company or location info; fill with NA placeholders
    # so the frame matches the SQL table's layout.
    new_jobs_df$Job_company <- NA
    new_jobs_df$Job_location <- NA
    rownames(new_jobs_df) <- NULL
    # Reorder to the SQL table's column order.
    new_jobs_df <- new_jobs_df[c("Job_id", "Job_title", "Job_url",
                                 "Job_company", "Job_location", "Site_id")]
    # Job_complete == 0 marks listings whose description is not yet scraped.
    new_jobs_df %<>%
        mutate(Job_complete = 0)
}

if (nrow(new_jobs_df2) > 0){
    # Drop LinkedIn export columns we never use downstream.
    new_jobs_df2 <- subset(new_jobs_df2, select = -c(X, job_url, company_url,
        linkedin_company_url_cleaned, posted_date, normalized_company_name))
    colnames(new_jobs_df2) <- c("Job_url", "Job_company", "Job_title",
                                "Job_location")
    # Ids are assigned later; 1001 is the Site_id reserved for LinkedIn.
    new_jobs_df2$Job_id <- NA
    new_jobs_df2$Site_id <- 1001
    rownames(new_jobs_df2) <- NULL
    # Reorder to the SQL table's column order.
    new_jobs_df2 <- new_jobs_df2[c("Job_id", "Job_title", "Job_url",
                                   "Job_company", "Job_location", "Site_id")]
    # Job_complete == 0 marks listings whose description is not yet scraped.
    new_jobs_df2 %<>%
        mutate(Job_complete = 0)
}

# Merge the new batches into the master jobs table, keeping only the first
# occurrence of each Job_url. NOTE: rbindlist() returns a data.table, so
# jobs_df changes class from data.frame to data.table here — later code that
# indexes jobs_df with [i, j] inherits data.table semantics.
if (nrow(new_jobs_df) > 0){
    jobs_df <- rbindlist(list(jobs_df, new_jobs_df))[!duplicated(Job_url)]
}
if (nrow(new_jobs_df2) > 0){
    jobs_df <- rbindlist(list(jobs_df, new_jobs_df2))[!duplicated(Job_url)]
}
# Renumber Job_id to match row position. NOTE(review): this discards any
# previous ids; downstream steps index jobs_df rows by Job_id and name the
# "<id>.txt" description files after it, so renumbering must not orphan
# files written on earlier runs — confirm.
jobs_df %<>%
    mutate(Job_id = row_number())

Scrape Each Unique Job Listing URL:

# XPath selector for the job-description node, keyed by Site_id.
desc_xpath_by_site <- c(
    "2594160" = "//div[contains(@id, 'job-description')]",         # ai-jobs.net
    "977141"  = "//div[contains(@class, 'job-description')]",      # python.org
    "2594162" = "//div[contains(@class, 'arDetailDescriptionRow')]", # careercast it & eng
    "1378327" = "//div[contains(@class, 'section_content')]",      # jobs for r-users
    "2594166" = "//div[contains(@class, 'job-desc')]",             # Open Data Science
    "2594174" = "//div[contains(@class, 'job_description')]"       # MLconf
)
# Sites that cannot be scraped automatically: 2593879 = Indeed,
# 1001 = LinkedIn. They are flagged -1 for manual handling. (The original
# also called httr::user_agent("Glen Davis") here, but the returned config
# object was discarded and xml2::read_html() does not consult it — a no-op.)
unscrapable_sites <- c(2593879, 1001)

# Job_complete codes: 0 = pending, 1 = description saved, -1 = needs manual
# follow-up. Columns: 1 = Job_id, 6 = Site_id, 7 = Job_complete.
for (i in seq_len(nrow(jobs_df))){
    # Only attempt rows not yet processed.
    if (jobs_df[[7]][i] != 0){
        next
    }
    site <- jobs_df[[6]][i]
    if (site %in% unscrapable_sites){
        jobs_df[i, 7] <- -1
        next
    }
    dat <- try(xml2::read_html(jobs_df$Job_url[[i]]), silent = TRUE)
    if (inherits(dat, "try-error")){
        jobs_df[i, 7] <- -1  # fetch failed; mark for manual follow-up
        next
    }
    xpath <- desc_xpath_by_site[as.character(site)]
    if (is.na(xpath)){
        # Unknown source: previously this silently reused the previous
        # iteration's desc; now it is routed to manual handling instead.
        jobs_df[i, 7] <- -1
        next
    }
    desc <- xml2::xml_text(xml2::xml_find_all(dat, xpath))
    # Save the description as "<Job_id>.txt" for the text-mining step.
    fn <- paste0(jobs_df[[1]][i], ".txt")
    writeLines(desc, con = fn)
    jobs_df[i, 7] <- 1
}

Manual Data Collection for LinkedIn:

# Export the LinkedIn rows that need a human to look up the description
# (Job_complete == -1 flags failed/blocked scrapes; Site_id 1001 = LinkedIn).
manual <- jobs_df %>%
    filter(Job_complete == -1, Site_id == 1001)
write.csv(manual, "manual.csv", row.names = FALSE)

We look up the job descriptions for the job listings in the manual.csv file manually, and we save them as column eight of a manual_edited.csv file. If we find a job description, we change the Job_complete value to 1. If we don’t, we can just delete that row. If a previous file exists, we save over it. We then upload the saved manual_edited.csv file to github before continuing.

file <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/manual_edited.csv"

# Fold the hand-collected descriptions back in. manual_edited.csv mirrors
# jobs_df's layout: col 1 = Job_id, col 7 = Job_complete, col 8 = the
# manually gathered job description.
manual_edited <- read.csv(file = file, header = TRUE)
for (i in 1:nrow(manual_edited)){
    job_id <- manual_edited[i, 1]
    # jobs_df rows are indexed by Job_id here — assumes Job_id equals the
    # row number (true right after the row_number() renumbering earlier,
    # but fragile; TODO confirm it still holds when this runs).
    if (jobs_df[job_id, 7] != 1){
        job_desc <- manual_edited[i, 8]
        jobs_df[job_id, 7] <- manual_edited[i, 7]
        # Save the description as "<Job_id>.txt" for the text-mining step.
        fn <- paste(job_id, ".txt", sep = "")
        writeLines(job_desc, con = fn)
    }
}

Add Previously Scraped (Nov. 2022) Data for Indeed Jobs (Source: Kaggle)

completed_files <- readLines("completed_files.txt")

# One-time import of a pre-scraped Indeed dataset from Kaggle; the ledger in
# completed_files.txt prevents re-importing it on later runs.
if (!("data_science_jobs_indeed_usa.csv" %in% completed_files)){
    file <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/data_science_jobs_indeed_usa.csv"
    kaggle_indeed <- read.csv(file = file, header = TRUE)
    kaggle_indeed <- subset(kaggle_indeed, select = -c(1, 5, 6, 7, 8))
    colnames(kaggle_indeed) <- c("Job_title", "Job_company", "Job_location",
                                 "Job_url", "Job_description")
    # Continue Job_id numbering after the current last row of jobs_df.
    kaggle_indeed$Job_id <- seq(nrow(jobs_df) + 1,
                                nrow(jobs_df) + nrow(kaggle_indeed))
    kaggle_indeed$Site_id <- 2593879  # Site_id reserved for Indeed
    kaggle_indeed$Job_complete <- 0
    rownames(kaggle_indeed) <- NULL
    kaggle_indeed <- kaggle_indeed[c("Job_id", "Job_title", "Job_url",
                                     "Job_company", "Job_location",
                                     "Site_id", "Job_complete",
                                     "Job_description")]
    # Append everything except the description column to the master table.
    jobs_df <- rbind(jobs_df, subset(kaggle_indeed, select = 1:7))
    # Write each description to "<Job_id>.txt" and mark the row complete.
    # seq_len() is safe if the CSV were ever empty (1:0 would iterate).
    for (i in seq_len(nrow(kaggle_indeed))){
        job_id <- kaggle_indeed[i, 1]
        job_desc <- kaggle_indeed[i, 8]
        jobs_df[job_id, 7] <- 1
        fn <- paste0(job_id, ".txt")
        writeLines(job_desc, con = fn)
    }
    write("data_science_jobs_indeed_usa.csv", file = "completed_files.txt",
          append = TRUE)
}

Connect to the SQL Database:

# Reconnect to the MariaDB jobs database; the password again comes from the
# local one-cell file sql_db.txt rather than the source.
db_pw <- as.character(read.table("sql_db.txt", header = FALSE))
con <- DBI::dbConnect(
  RMariaDB::MariaDB(),
  dbname = "dat_sci_jobs",
  username = "root",
  password = db_pw,
  host = "35.227.102.234")

Write to the SQL Database Only If There Were Changes and Disconnect:

tables <- dbListTables(con)
copy <- dbReadTable(con, "_Jobs")

# Rewrite the table only when the in-memory data actually differs. Compare
# values, not objects: jobs_df may be a data.table while dbReadTable()
# returns a plain data.frame, so identical() would always be FALSE and the
# table would be rewritten on every single run.
if (!isTRUE(all.equal(as.data.frame(jobs_df), copy,
                      check.attributes = FALSE))){
    dbWriteTable(con, "_Jobs", jobs_df, overwrite = TRUE)
}

dbDisconnect(con)

Set Up a Text Data Frame from the TXT Job Description Files

# Move freshly written "<Job_id>.txt" files into the jobs-txt/ folder.
files <- list.files(pattern = "^[1-9]+.*txt$")
if (length(files) > 0){
    file.copy(from = file.path(getwd(), files),
              to = file.path(getwd(), "jobs-txt", files))
    # file.remove() has no `from` argument (the original's named arg was
    # silently swallowed by `...`); pass the paths positionally.
    file.remove(file.path(getwd(), files))
}

# The description TXT files now live in jobs-txt/.
files <- list.files(path = file.path(getwd(), "jobs-txt"),
                    pattern = "^[1-9]+.*txt$")

cols <- c("Text", "Job_id", "Line")

# Resume from the previously saved text_df when some files were already
# parsed on an earlier run; otherwise start from an empty 3-column frame.
completed_txt_files <- readLines("completed_txt_files.txt")
if (length(completed_txt_files) > 0){
    my_url <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/text_df.csv"
    text_df <- read.csv(file = my_url, header = TRUE)
}else{
    text_df <- as.data.frame(matrix(nrow = 0, ncol = 3))
    colnames(text_df) <- cols
}

url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/jobs-txt/"

# Parse each not-yet-processed TXT file into one text_df row per line of
# text: (Text, Job_id, Line). seq_along() avoids the 1:length(files) trap
# when no files are pending.
for (i in seq_along(files)){
    if (!(files[i] %in% completed_txt_files)){
        file <- paste0(url_base, files[i])
        # Strip the literal ".txt" suffix; in the original str_replace the
        # unescaped "." was a regex wildcard.
        job_id <- str_remove(files[i], fixed(".txt"))
        lines <- readLines(file)
        # Build the companion columns once — the original rebuilt the same
        # two matrices length(lines) times inside a pointless inner loop.
        addition <- cbind(lines,
                          rep(job_id, length(lines)),
                          seq_along(lines))
        colnames(addition) <- cols
        text_df <- rbind(text_df, addition)
        write(files[i], file = "completed_txt_files.txt", append = TRUE)
    }
}

rownames(text_df) <- NULL
write.csv(text_df, "text_df.csv", row.names = FALSE)

Analyze the Text Data Frame with Tidytext

# Tokenize every job description into single words.
tidy_text_df_words <- text_df %>%
    unnest_tokens(word, Text)

# Per-word totals after removing stop words: term_freq = raw occurrences,
# doc_count = number of distinct job listings containing the word.
tidy_text_words_analysis <- tidy_text_df_words %>%
    # State the join key explicitly so anti_join() does not have to guess
    # (and does not print "Joining with `by = join_by(word)`" on every run).
    anti_join(get_stopwords(), by = "word") %>%
    group_by(word) %>%
    summarize(term_freq = n(), doc_count = n_distinct(Job_id)) %>%
    arrange(desc(doc_count))
datatable(tidy_text_words_analysis, options = list(pageLength = 25))
# Word cloud of the 50 most widespread words.
tidy_text_words_analysis %>%
    with(wordcloud(word, doc_count, max.words = 50))
## Warning in wordcloud(word, doc_count, max.words = 50): experience could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): working could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): including could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): business could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): technology could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): analytics could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): ability could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): data could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): engineering could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): environment could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): years could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): building could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): support could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): development could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): related could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): using could not be fit
## on page. It will not be plotted.
## Warning in wordcloud(word, doc_count, max.words = 50): work could not be fit on
## page. It will not be plotted.

# Tokenize the descriptions into two-word phrases.
tidy_text_df_bigrams <- text_df %>%
    unnest_tokens(bigram, Text, token = "ngrams", n = 2)

# Per-bigram totals: term_freq = raw occurrences, doc_count = number of
# distinct job listings containing the bigram.
tidy_text_bigrams_analysis <- tidy_text_df_bigrams %>%
    group_by(bigram) %>%
    summarize(term_freq = n(),
              doc_count = n_distinct(Job_id)) %>%
    arrange(desc(doc_count))

# Interactive table plus a word cloud of the 50 most widespread bigrams.
datatable(tidy_text_bigrams_analysis, options = list(pageLength = 25))
with(tidy_text_bigrams_analysis,
     wordcloud(bigram, doc_count, max.words = 50))
## Warning in wordcloud(bigram, doc_count, max.words = 50): years of could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): ability to could not
## be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): looking for could not
## be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): equal opportunity
## could not be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): committed to could not
## be fit on page. It will not be plotted.
## Warning in wordcloud(bigram, doc_count, max.words = 50): we are could not be
## fit on page. It will not be plotted.

# Tokenize the descriptions into three-word phrases.
tidy_text_df_trigrams <- text_df %>%
    unnest_tokens(trigram, Text, token = "ngrams", n = 3)

# Per-trigram totals: term_freq = raw occurrences, doc_count = number of
# distinct job listings containing the trigram.
tidy_text_trigrams_analysis <- tidy_text_df_trigrams %>%
    group_by(trigram) %>%
    summarize(term_freq = n(),
              doc_count = n_distinct(Job_id)) %>%
    arrange(desc(doc_count))

# Interactive table plus a word cloud of the 50 most widespread trigrams.
datatable(tidy_text_trigrams_analysis, options = list(pageLength = 25))
with(tidy_text_trigrams_analysis,
     wordcloud(trigram, doc_count, max.words = 50))